1 Setup

setwd("/mnt/picea/projects/aspseq/jfelten/T89-Laccaria-bicolor/January/Salmon")
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
source("~/Git/UPSCb/src/R/plot.multidensity.R")
source("~/Git/UPSCb/src/R/featureSelection.R")
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/UPSCb/projects/T89-Laccaria-bicolor/doc/Samples.csv") %>% 
  mutate(Time=factor(Time)) %>% 
  mutate(Experiment=factor(Experiment))
## Parsed with column specification:
## cols(
##   SciLifeID = col_character(),
##   SampleName = col_double(),
##   Time = col_double(),
##   Experiment = col_character(),
##   Replicate = col_character()
## )

tx2gene translation

Potra.tx2gene <- read_delim("/mnt/picea/storage/reference/Populus-tremula/v1.1/annotation/tx2gene.tsv",
                            "\t",col_names=c("TXID","GENE"))
## Parsed with column specification:
## cols(
##   TXID = col_character(),
##   GENE = col_character()
## )
Potri.tx2gene <- read_delim("/mnt/picea/storage/reference/Populus-trichocarpa/v3.0/annotation/tx2gene.tsv",
                            "\t",col_names=c("TXID","GENE"))
## Parsed with column specification:
## cols(
##   TXID = col_character(),
##   GENE = col_character()
## )

2 Laccaria bicolor data

2.1 Raw data

lb.filelist <- list.files("Lacbi2", 
                    recursive = TRUE, 
                    pattern = "quant.sf",
                    full.names = TRUE)

Select the samples containing fungi

stopifnot(all(str_which(basename(lb.filelist),samples$SciLifeID) == 1:nrow(samples)))
names(lb.filelist) <- samples$SciLifeID
lb.filelist <- lb.filelist[samples$Experiment %in% c("ECM","FLM")]

Read the expression at the gene level (there is one transcript per gene)

lb.g <- suppressMessages(tximport(files = lb.filelist, 
                                type = "salmon",txOut=TRUE))

counts <- round(lb.g$counts)

2.2 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "24.1% percent (5507) of 22822 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(names(lb.filelist),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 5507 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 305984 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 305984 rows containing non-finite values (stat_density).

2.3 Export

dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/Lacbi-raw-unormalised-gene-expression_data.csv")

2.4 Data normalisation

2.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="../analysis/salmon/Lacbi-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is relatively variable (40 to 260 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_101 P11915_102 P11915_103 P11915_104 P11915_105 P11915_106
1.032 1.463 0.8219 0.4535 0.9587 0.9555
Table continues below
P11915_107 P11915_112 P11915_113 P11915_114 P11915_115 P11915_116
1.016 0.919 0.8393 0.8228 0.8478 0.7025
Table continues below
P11915_117 P11915_118 P11915_123 P11915_124 P11915_125 P11915_126
1.989 1.768 1.237 1.177 1.07 0.4598
Table continues below
P11915_127 P11915_128 P11915_129 P11915_134 P11915_135 P11915_136
0.8611 1.232 1.304 2.177 0.8829 1.414
Table continues below
P11915_137 P11915_138 P11915_139 P11915_140 P11915_145 P11915_146
0.3045 0.8093 1.163 0.7178 1.904 2.071
P11915_147 P11915_148 P11915_149 P11915_150 P11915_151
2.574 0.9657 0.9867 0.4749 1.045
boxplot(sizes, main="Sequencing libraries size factor")

2.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked very well, the data is nearly homoscesdastic

meanSdPlot(vst[rowSums(counts)>0,])

2.6 QC on the normalised data

2.6.1 PCA

pc <- prcomp(t(vst))
  
percent <- round(summary(pc)$importance[2,]*100)

2.6.2 3 first dimensions

This looks interesting as the sample separate clearly both by Experiment and Time in the first 2 components.

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]-1])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-1])

par(mar=mar)

2.6.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

2.6.4 Heatmap

Filter for noise A cutoff at a VST value of 1 leaves about 15000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                   conditions=conds,
                   nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples.
heatmap.2(t(scale(t(vst[sels[[2]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering remains the same even for the most variable genes

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes shows a separation by experiment and time point

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

2.7 Conclusion

The quality of the data is good. The PCA shows that the samples cluster by experiment and time, however, the heatmap shows a clustering that correlates with the mapping rates, i.e. the mixed amount of reads originating from either organism. The final heatmap seem to indicate that this effect is neglectable albeit confounded.

3 T89 ( Populus tremula x Populus tremuloides ) data

3.1 Raw data

pt.filelist <- list.files("Potra01", 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Select the samples containing fungi

stopifnot(all(str_which(basename(pt.filelist),samples$SciLifeID) == 1:nrow(samples)))
names(pt.filelist) <- samples$SciLifeID
pt.filelist <- pt.filelist[samples$Experiment %in% c("Cont","ECM")]

Read the expression at the gene level (there is one transcript per gene)

pt.g <- suppressMessages(tximport(files = pt.filelist, 
                                  type = "salmon",
                                  tx2gene=Potra.tx2gene))

counts <- round(pt.g$counts)

3.2 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "13.4% percent (4547) of 34043 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(names(pt.filelist),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 4547 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 521762 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 521762 rows containing non-finite values (stat_density).

3.3 Export

dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/Potra-raw-unormalised-gene-expression_data.csv")

3.4 Data normalisation

3.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="../analysis/salmon/Potra-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is highly variable (4 to 900 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_104 P11915_105 P11915_106 P11915_107 P11915_108 P11915_109
0.04633 0.1874 0.1125 0.2241 1.807 1.466
Table continues below
P11915_110 P11915_111 P11915_115 P11915_116 P11915_117 P11915_118
1.283 0.8275 0.1309 0.1461 0.1667 0.3031
Table continues below
P11915_119 P11915_120 P11915_121 P11915_122 P11915_126 P11915_127
2.871 3.802 3.454 2.553 0.9284 0.2632
Table continues below
P11915_128 P11915_129 P11915_130 P11915_131 P11915_132 P11915_133
0.9129 0.8084 4.439 2.245 2.316 3.72
Table continues below
P11915_137 P11915_138 P11915_139 P11915_140 P11915_141 P11915_142
0.7642 0.4624 0.2597 0.1985 2.102 6.673
Table continues below
P11915_143 P11915_144 P11915_148 P11915_149 P11915_150 P11915_151
2.986 2.643 1.889 0.7019 1.433 0.9264
P11915_152 P11915_153 P11915_154 P11915_155
3.792 6.972 9.011 2.88
boxplot(sizes, main="Sequencing libraries size factor")

3.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(counts)>0,])

3.6 QC on the normalised data

3.6.1 PCA

pc <- prcomp(t(vst))

percent <- round(summary(pc)$importance[2,]*100)

3.6.2 3 first dimensions

This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-3])

par(mar=mar)

3.6.3 2D

The sample P11915_104 is an outlier

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

3.6.4 Heatmap

Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples, but it is not as striking as for Laccaria bicolor. There is clustering by time, especially for the earlier time points. The later ECM time points are closer to the earlier ECM samples, while the later control cluster together, apart from some outliers.
heatmap.2(t(scale(t(vst[sels[[3]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering shows a better clustering by time and experiment, even though there are still outliers

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes is very similar to the overall profile

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

3.7 Conclusion

The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.

4 _Populus trichocarpa_data ## Raw data

pt.filelist <- list.files("Potri03", 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Select the samples containing fungi

stopifnot(all(str_which(basename(pt.filelist),samples$SciLifeID) == 1:nrow(samples)))
names(pt.filelist) <- samples$SciLifeID
pt.filelist <- pt.filelist[samples$Experiment %in% c("Cont","ECM")]

Read the expression at the gene level (there is one transcript per gene)

pt.g <- suppressMessages(tximport(files = pt.filelist, 
                                  type = "salmon",
                                  tx2gene=Potri.tx2gene))

counts <- round(pt.g$counts)

4.1 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "18.7% percent (7719) of 41270 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(names(pt.filelist),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 7719 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 782447 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 782447 rows containing non-finite values (stat_density).

4.2 Export

dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/Potri-raw-unormalised-gene-expression_data.csv")

4.3 Data normalisation

4.3.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="../analysis/salmon/Potri-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is highly variable (4 to 900 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_104 P11915_105 P11915_106 P11915_107 P11915_108 P11915_109
0.04798 0.1884 0.1159 0.2284 1.8 1.467
Table continues below
P11915_110 P11915_111 P11915_115 P11915_116 P11915_117 P11915_118
1.285 0.8358 0.1348 0.1456 0.167 0.3027
Table continues below
P11915_119 P11915_120 P11915_121 P11915_122 P11915_126 P11915_127
2.823 3.833 3.487 2.574 0.9294 0.2522
Table continues below
P11915_128 P11915_129 P11915_130 P11915_131 P11915_132 P11915_133
0.9169 0.8099 4.363 2.273 2.321 3.686
Table continues below
P11915_137 P11915_138 P11915_139 P11915_140 P11915_141 P11915_142
0.7899 0.4734 0.2551 0.199 2.041 6.628
Table continues below
P11915_143 P11915_144 P11915_148 P11915_149 P11915_150 P11915_151
3.026 2.578 1.88 0.6946 1.435 0.9353
P11915_152 P11915_153 P11915_154 P11915_155
3.739 6.844 9.038 2.841
boxplot(sizes, main="Sequencing libraries size factor")

4.4 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(counts)>0,])

4.5 QC on the normalised data

4.5.1 PCA

pc <- prcomp(t(vst))

percent <- round(summary(pc)$importance[2,]*100)

4.5.2 3 first dimensions

This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-3])

par(mar=mar)

4.5.3 2D

The sample P11915_104 is an outlier

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

ggplotly(p)
## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

## Warning in if (nchar(axisTitleText) > 0) {: the condition has length > 1
## and only the first element will be used

4.5.4 Heatmap

Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples, but it is not as striking as for Laccaria bicolor. There is clustering by time, especially for the earlier time points. The later ECM time points are closer to the earlier ECM samples, while the later control cluster together, apart from some outliers.
heatmap.2(t(scale(t(vst[sels[[3]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering shows a better clustering by time and experiment, even though there are still outliers

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes is very similar to the overall profile

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

4.6 Conclusion

The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.

5 Session Info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.52.0                  tximport_1.12.3            
##  [3] forcats_0.4.0               stringr_1.4.0              
##  [5] dplyr_0.8.3                 purrr_0.3.2                
##  [7] readr_1.3.1                 tidyr_0.8.3                
##  [9] tibble_2.1.3                tidyverse_1.2.1            
## [11] scatterplot3d_0.3-41        RColorBrewer_1.1-2         
## [13] plotly_4.9.0                pander_0.6.3               
## [15] hyperSpec_0.99-20180627     ggplot2_3.2.1              
## [17] lattice_0.20-38             gplots_3.0.1.1             
## [19] DESeq2_1.24.0               SummarizedExperiment_1.14.1
## [21] DelayedArray_0.10.0         BiocParallel_1.18.1        
## [23] matrixStats_0.54.0          Biobase_2.44.0             
## [25] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
## [27] IRanges_2.18.2              S4Vectors_0.22.0           
## [29] BiocGenerics_0.30.0         data.table_1.12.2          
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1       htmlTable_1.13.1       XVector_0.24.0        
##   [4] base64enc_0.1-3        rstudioapi_0.10        hexbin_1.27.3         
##   [7] affyio_1.54.0          bit64_0.9-7            AnnotationDbi_1.46.1  
##  [10] lubridate_1.7.4        xml2_1.2.2             splines_3.6.1         
##  [13] geneplotter_1.62.0     knitr_1.24             zeallot_0.1.0         
##  [16] Formula_1.2-3          jsonlite_1.6           broom_0.5.2           
##  [19] annotate_1.62.0        cluster_2.1.0          shiny_1.3.2           
##  [22] BiocManager_1.30.4     compiler_3.6.1         httr_1.4.1            
##  [25] backports_1.1.4        assertthat_0.2.1       Matrix_1.2-17         
##  [28] lazyeval_0.2.2         limma_3.40.6           cli_1.1.0             
##  [31] later_0.8.0            acepack_1.4.1          htmltools_0.3.6       
##  [34] tools_3.6.1            gtable_0.3.0           glue_1.3.1            
##  [37] GenomeInfoDbData_1.2.1 affy_1.62.0            reshape2_1.4.3        
##  [40] Rcpp_1.0.2             cellranger_1.1.0       vctrs_0.2.0           
##  [43] preprocessCore_1.46.0  gdata_2.18.0           nlme_3.1-141          
##  [46] crosstalk_1.0.0        xfun_0.9               testthat_2.2.1        
##  [49] rvest_0.3.4            mime_0.7               gtools_3.8.1          
##  [52] XML_3.98-1.20          zlibbioc_1.30.0        scales_1.0.0          
##  [55] promises_1.0.1         hms_0.5.1              yaml_2.2.0            
##  [58] memoise_1.1.0          gridExtra_2.3          rpart_4.1-15          
##  [61] latticeExtra_0.6-28    stringi_1.4.3          RSQLite_2.1.2         
##  [64] highr_0.8              genefilter_1.66.0      checkmate_1.9.4       
##  [67] caTools_1.17.1.2       rlang_0.4.0            pkgconfig_2.0.2       
##  [70] bitops_1.0-6           evaluate_0.14          labeling_0.3          
##  [73] htmlwidgets_1.3        bit_1.1-14             tidyselect_0.2.5      
##  [76] plyr_1.8.4             magrittr_1.5           R6_2.4.0              
##  [79] generics_0.0.2         Hmisc_4.2-0            DBI_1.0.0             
##  [82] pillar_1.4.2           haven_2.1.1            foreign_0.8-72        
##  [85] withr_2.1.2            survival_2.44-1.1      RCurl_1.95-4.12       
##  [88] nnet_7.3-12            modelr_0.1.5           crayon_1.3.4          
##  [91] KernSmooth_2.23-15     rmarkdown_1.15         locfit_1.5-9.1        
##  [94] readxl_1.3.1           blob_1.2.0             digest_0.6.20         
##  [97] xtable_1.8-4           httpuv_1.5.1           munsell_0.5.0         
## [100] viridisLite_0.3.0